
/*

Input: electoral rolls (for the federal election 2009) and polling place files (all other elections)
	> BTW2009-Strvz-num.csv						[federal 2009]
	> BTW_2013_Wahllokalverzeichnis.xlsx		[federal 2013]
	> KommW_2014_Wahllokalverzeichnis.xlsx		[municipal 2014]
	> BTW_2017_Wahllokalverzeichnis.xlsx		[federal 2017]
	> LTW_2018_Wahllokalverzeichnis.xlsx		[state 2018]
	> EuW_2019_Wahllokalverzeichnis.xlsx		[european 2019]
	> KommW_2020_Wahllokalverzeichnis.xlsx		[municipal 2020]

Output: 
	> tmp/wl_unique.dta [list for geocoding of polling locations]
	> tmp/wl_sb_assignments_raw.dta [assignment of precincts and polling locations for each election]

Tasks:
	>Import adresses of polling locations for all election years
	>Prepare them for geocoding
	>Merge them (and save a list of unique locations)
	>Append all elections to have assignment of precincts and polling locations
	
*/

/* programm to clean street names */
cap program drop clean_streetname
program clean_streetname 

syntax varname(string)

	// general clean
	replace `varlist' = subinstr(`varlist', "EG", "", .)     
	replace `varlist' = subinstr(`varlist', "1.OG", "", .)
	replace `varlist' = subinstr(`varlist',"St.","Sankt",.)
	replace `varlist' = subinstr(`varlist',"Südl.","Südliche",.)
	replace `varlist' = subinstr(`varlist',"Alle","Allee",.)
	replace `varlist' = subinstr(`varlist',"Alleee","Allee",.)	
	replace `varlist' = subinstr(`varlist',"pl.","platz",.)
	replace `varlist' = subinstr(`varlist',"Pl.","Platz",.)		
	replace `varlist' = ustrregexra(`varlist',"[sS]tra(ss|ß)e","strasse")
	replace `varlist' = ustrregexra(`varlist',"[sS]tr\.","strasse")
	
	// BTW13
	replace `varlist' = subinstr(`varlist',"Benningsenstrasse","Bennigsenstrasse",.)
	replace `varlist' = subinstr(`varlist',"Max-Kolmsberger-strasse","Max-Kolmsperger-strasse",.)
	replace `varlist' = subinstr(`varlist',"Agilofingerplatz","Agilolfingerplatz",.)
	replace `varlist' = subinstr(`varlist',"Freseniussstrasse","Freseniusstrasse",.)
	replace `varlist' = subinstr(`varlist',"Max-Pröbstl-strasse","Max-Proebstl-strasse",.)
		
	// BTW17
	replace `varlist' = subinstr(`varlist',"Hiltenpergerstrasse","Hiltenspergerstrasse",.)
	replace `varlist' = subinstr(`varlist',"Rosa-Bavaresa-strasse","Rosa-Bavarese-strasse",.)
	replace `varlist' = subinstr(`varlist',"Max-Proepstl-strasse","Max-Proebstl-strasse",.)
	replace `varlist' = subinstr(`varlist',"Von-der-Pforden-strasse","Von-der-Pfordten-strasse",.)
	
	// KOW20 
	replace `varlist' = subinstr(`varlist',"Petracastrasse","Petrarcastrasse",.)
	replace `varlist' = subinstr(`varlist',"Alb.","Albert",.)
	replace `varlist' = subinstr(`varlist',"Hildegard-Hamm-Brücher","Hildegard-Hamm-Brücher-strasse",.)
	
	replace `varlist' = "Untermenzingerstrasse" if `varlist'=="UntermenzingerStrasse"
	
end

/* programm to harmonize street names to create unique PP ID */
cap program drop harmon_street
program harmon_street 

syntax varname(string)

	replace `varlist' = subinstr(`varlist',"Dr.","Doktor",.)
	replace `varlist' = subinstr(`varlist',"Prof.","Professor",.)
	replace `varlist' = subinstr(`varlist'," ","",.)
	replace `varlist' = lower(`varlist')
	replace `varlist' = subinstr(`varlist',"-","",.)
	replace `varlist' = subinstr(`varlist',".","",.)
	replace `varlist' = subinstr(`varlist',"'","",.)
	replace `varlist' = subinstr(`varlist',"ß","ss",.)
	replace `varlist' = subinstr(`varlist',"ä","ae",.)
	replace `varlist' = subinstr(`varlist',"ö","oe",.)
	replace `varlist' = subinstr(`varlist',"ü","ue",.)

end

/* 1. Import and prep*/

tempfile wl_btw09 wl_btw13 wl_kow14 wl_btw17 wl_ltw18 wl_euw19 wl_kow20
tempfile wl_btw09_uq wl_btw13_uq wl_kow14_uq wl_btw17_uq wl_ltw18_uq wl_euw19_uq wl_kow20_uq

*--------------------*
**** Import BTW09 ****
*--------------------*
* Note: 2009 different from other elections as polling places come from the electoral rolls
import delim using "$rawdata/election_office/electoral_rolls/BTW2009-Strvz-num.csv", ///
	encoding("utf-8") clear colrange(1:7) delim(";")
drop v2 v4 v6

* keep only rows with relevant information
	missings dropobs, force

* sever columns and append below each other
	forvalues j=1(2)7 {
		preserve
			keep v`j'
			rename v`j' v1
			tempfile strassen_btw09_v`j'
			save 	 `strassen_btw09_v`j''
		restore
	}
 clear 
	 // append
	 forvalues j=1(2)7 {
		append using `strassen_btw09_v`j''
	 }

// gen variables for city district (stadtbez) and precint (sb)
	gen stadtbez = subinstr(v1,"S t a d t b e z i r k","",.) if substr(v1, 1, 21)=="S t a d t b e z i r k"
	destring stadtbez, replace
	distinct stadtbez 
	assert r(ndistinct)==25	// assert 25 districts
	
	gen sb_all = subinstr(v1,"W a h l b e z i r k","",.) if substr(v1, 1, 19)=="W a h l b e z i r k"

// identify and extract polling place address
/*
Note: Entries for Polling places follow this structure:
	
	---blank row ---
	Wahlraum:					// Always preceded by "Wahlraum" ("voting room")
	Schule						// Type of Polling place (1 or 2 lines)
	Südliche Auffahrtsallee 82 // Polling place address
	---blank row ---

*/
	gen tmp_strasse = subinstr(v1, " ", "", .)
	// gen dummy that identifies start of Polling place entry
	gen isWL = ustrregexm(tmp_strasse,"\bWahlraum\b")
	// expand dummy to fill total entry based on -blanks- in tmp_strasse
	replace isWL = isWL[_n-1] if !missing(tmp_strasse[_n-1]) &  !missing(tmp_strasse[_n]) 
	// extract PP address (=last row of each entry)
	gen 	wahllokal= v1 if isWL==1 & missing(tmp_strasse[_n+1])
	lab var wahllokal "Raw PP address"
	cap drop tmp*	
	
 // extract wahllokal_type 
	gen wahllokal_type = ""
	replace wahllokal_type = v1 if isWL==1 & strpos(v1,"Wahlraum")==0 & wahllokal==""
	replace wahllokal_type = wahllokal_type[_n-1]+wahllokal_type[_n] if !missing(wahllokal_type[_n])
	replace wahllokal_type = "" if wahllokal_type[_n-1]=="" & wahllokal_type[_n+1]!=""
	replace wahllokal_type = stritrim(wahllokal_type)
	replace wahllokal_type = strtrim(wahllokal_type)
	replace wahllokal_type=wahllokal_type[_n-1] if wahllokal!=""
	lab var wahllokal_type "Raw PP description"
	drop isWL
	
// expand district and precinct IDs
	replace stadtbez = stadtbez[_n-1] if missing(stadtbez)
	replace sb_all = sb_all[_n-1] if missing(sb_all)
	replace sb_all = strtrim(sb_all)
	lab var sb_all "(List of) Precinct ID(s)"

// keep unique combinations of precinct and polling place 
	keep if !missing(wahllokal)
	keep sb_all wahllokal wahllokal_type
	
** Clean PP address; (remove spaces and unnecessary information (e.g., floor))
	replace wahllokal = subinstr(wahllokal, " ", "", .)
	clean_streetname wahllokal

	// extract street name of PP
	gen 	strasse_wl = ustrregexs(1) if ustrregexm(wahllokal,"([^0-9]+)[\.]?([\w-]+)")
	lab var strasse_wl "PP street name cleaned for geocoding"
	// extract address number (including cases e.g., 1-4 and 5a)
	gen 	nummer_wl =  ustrregexs(2) if ustrregexm(wahllokal,"([^0-9]+)[\.]?([\w-]+)")
	lab var nummer_wl "PP address number cleaned for geocoding"
	
	// drop the letters in cases of 5A etc. ; and keep only first number in cases of 2-4
	replace nummer_wl = ustrregexs(1) if ustrregexm(nummer_wl,"(^[0-9]+)")
	
* Harmonize street name to create PP identifier
	gen     tmp = strasse_wl
	harmon_street tmp 

* Gen wahllokal_id := PP identifier = street + number	
	gen		wahllokal_id = tmp + nummer_wl
	lab var wahllokal_id "raw PP ID (string)"
	drop tmp* wahllokal

*save: Assignments of precinct to polling places in BTW09
	save `wl_btw09'

* KEEP distinct polling places for geocoding 
	keep wahllokal_id strasse_wl nummer_wl
	// keep unique polling locations
	duplicates drop wahllokal_id, force
	
	// save: unique PP
	save `wl_btw09_uq'
	
	
*--------------------*
**** Import BTW13 ****
*--------------------*
	import excel  "$rawdata/election_office/polling_places/BTW_2013_Wahllokalverzeichnis.xlsx", sheet(Table 1) firstrow clear
	ren 	wahlbezirk sb_all
	lab var sb_all "(List of) Precinct ID(s)"
	ren 	gebaeudebeschreibung wahllokal_type 
	lab var wahllokal_type "raw polling place type (from electoral office)"
	ren 	strasse strasse_wl
	lab var strasse_wl "PP street name cleaned for geocoding"
	ren 	nummer 	nummer_wl
	lab var nummer_wl "PP address number cleaned for geocoding"
	
	drop 	adresse_wahllokal

*Clean street names for geocoding
	replace strasse_wl = subinstr(strasse_wl,"Pfanzeltpl. 5","Pfanzeltplatz",.)
	replace nummer_wl = subinstr(nummer_wl,"a","5a",.) if strasse_wl=="Pfanzeltplatz"
	
	clean_streetname strasse_wl
	
* Drop letters in street number for geocoding
	// drop the letters in cases of 5A
	replace nummer_wl = ustrregexs(1) if ustrregexm(nummer_wl,"(^[0-9]+)")
	// assert all number as real numbers
	assert real(nummer_wl)!=.
	
* Harmonize street name for PP identifier 
	gen     tmp = strasse_wl
	harmon_street tmp

* Gen wahllokal_id := PP identifier = street + number	
	gen		wahllokal_id = tmp + nummer_wl
	lab var wahllokal_id "raw PP ID (string)"
	drop tmp*
	
*save: Assignments of precincts to polling locations in btw13
	save `wl_btw13'

* KEEP unique polling locations for geocoding 
	keep 			wahllokal_id strasse_wl nummer_wl
	duplicates drop wahllokal_id, force
	save `wl_btw13_uq'

*--------------------*
**** Import KOW14 ****
*--------------------*
	import excel  "$rawdata/election_office/polling_places/KommW_2014_Wahllokalverzeichnis.xlsx", sheet(Table 1) firstrow clear
	rename 	Wahllokalnummer sb_all
	lab var sb_all "(List of) Precinct ID(s)"
	rename 	Strasse strasse_wl
	lab var strasse_wl "PP street cleaned for geocoding"
	rename 	Nr nummer_wl
	lab var nummer_wl "PP address number cleaned for geocoding"
	rename 	Wahllokalname wahllokal_type
	lab var wahllokal_type "raw polling place type (from electoral office)"
	

*Change street names for geocoding
	clean_streetname strasse_wl

*number to string
	tostring nummer_wl, replace
	
* Harmonize street name for PP identifier 
	gen     tmp = strasse_wl
	harmon_street tmp

* Gen wahllokal_id := PP identifier = street + number	
	gen		wahllokal_id = tmp + nummer_wl
	lab var wahllokal_id "raw PP ID (string)"
	drop tmp*
		
*save: Correspondence of PP to precinct 
	save `wl_kow14'

* KEEP unique PP for geocoding 
	keep 			wahllokal_id strasse_wl nummer_wl
	duplicates drop wahllokal_id, force
	// save
	save `wl_kow14_uq'

	
*--------------------*
**** Import BTW17 ****
*--------------------*

	import excel  "$rawdata/election_office/polling_places/BTW_2017_Wahllokalverzeichnis.xlsx", sheet(Table 1) firstrow clear
	ren 	Wahllokalnummer sb_all
	lab var sb_all "(List of) Precinct ID(s)"
	ren 	Strasse strasse_wl
	lab var strasse_wl "PP street cleaned for geocoding"
	ren 	Nr nummer_wl
	lab var nummer_wl "PP address number"
	ren 	Wahllokalname wahllokal_type
	lab var wahllokal_type "raw polling place type (from electoral office)"

*Change street names for geocoding
	clean_streetname strasse_wl

*number to string
	tostring nummer_wl, replace
	
* Harmonize street name for PP identifier 
	gen     tmp = strasse_wl
	harmon_street tmp

* Gen wahllokal_id := PP identifier = street + number	
	gen		wahllokal_id = tmp + nummer_wl
	lab var wahllokal_id "raw PP ID (string)"
	drop tmp*
	
*save: correspondence of precincts to PP
	save `wl_btw17'

* KEEP unique PP for geocoding
	keep wahllokal_id strasse_wl nummer_wl
	order wahllokal_id strasse_wl nummer_wl
	duplicates drop wahllokal_id, force 
	// save
	save `wl_btw17_uq'

*--------------------*
**** Import LTW18 ****
*--------------------*
	import excel  "$rawdata/election_office/polling_places/LTW_2018_Wahllokalverzeichnis.xlsx", sheet(Table 1) firstrow clear
	
	ren 	Wahllokalnummer sb_all
	lab var sb_all "(List of) Precinct ID(s)"
	ren 	Strasse strasse_wl
	lab var strasse_wl "PP street cleaned for geocoding"
	ren 	Nr nummer_wl
	lab var nummer_wl "PP address number"
	ren 	Wahllokalname wahllokal_type
	lab var wahllokal_type "raw polling place type (from electoral office)"	
	
*Change street names for geocoding
	clean_streetname strasse_wl

*number to string
	tostring nummer_wl, replace

* Harmonize street name for PP identifier 
	gen     tmp = strasse_wl
	harmon_street tmp
	
* Gen wahllokal_id := PP identifier = street + number	
	gen		wahllokal_id = tmp + nummer_wl
	lab var wahllokal_id "raw PP ID (string)"
	drop tmp*
	
*save: correspondence of precincts to PP
	save `wl_ltw18'

* KEEP unique PP for geocoding
	keep wahllokal_id strasse_wl nummer_wl

	duplicates drop wahllokal_id, force 
	// save
	save `wl_ltw18_uq'

*--------------------*
**** Import EUW19 ****
*--------------------*
	import excel  "$rawdata/election_office/polling_places/EuW_2019_Wahllokalverzeichnis.xlsx", sheet(Table 1) firstrow clear
	
	ren 	Wahllokalnummer sb_all
	lab var sb_all "(List of) Precinct ID(s)"
	ren 	Strasse strasse_wl
	lab var strasse_wl "PP street cleaned for geocoding"
	ren 	Nr nummer_wl
	lab var nummer_wl "PP address number"
	ren 	Wahllokalname wahllokal_type
	lab var wahllokal_type "raw polling place type (from electoral office)"	

*Change street names for geocoding
	clean_streetname strasse_wl

*number to string
	tostring nummer_wl, replace
	
* Harmonize street name for PP identifier 
	gen     tmp = strasse_wl
	harmon_street tmp
	
* Gen wahllokal_id := PP identifier = street + number	
	gen		wahllokal_id = tmp + nummer_wl
	lab var wahllokal_id "raw PP ID (string)"
	drop tmp*
	
*save: correspondence of precincts to polling locations
	save `wl_euw19'

* KEEP unique PP for geocoding
	keep wahllokal_id strasse_wl nummer_wl
	duplicates drop wahllokal_id, force 
	// save
	save `wl_euw19_uq'	
	

*--------------------*
**** Import KOW20 ****
*--------------------*
	import excel  "$rawdata/election_office/polling_places/KommW_2020_Wahllokalverzeichnis.xlsx", sheet(Table 1) firstrow clear
	
	ren 	Wahllokalnummer sb_all
	lab var sb_all "(List of) Precinct ID(s)"
	ren 	Strasse strasse_wl
	lab var strasse_wl "PP street cleaned for geocoding"
	ren 	Nr nummer_wl
	lab var nummer_wl "PP address number"
	ren 	Wahllokalname wahllokal_type
	lab var wahllokal_type "raw polling place type (from electoral office)"


*Change street names for geocoding
	clean_streetname strasse_wl

*number to string
	tostring nummer_wl, replace
	
* Harmonize street name for PP identifier 
	gen     tmp = strasse_wl
	harmon_street tmp
	
* Gen wahllokal_id := PP identifier = street + number	
	gen		wahllokal_id = tmp + nummer_wl
	lab var wahllokal_id "raw PP ID (string)"
	drop tmp*

*save: correspondence of precinct to polling locations
	save `wl_kow20'	

* KEEP unique PP for geocoding
	keep wahllokal_id strasse_wl nummer_wl
	duplicates drop wahllokal_id, force 
	// save
	save `wl_kow20_uq'	
	
	
	
/* 2. MERGE UNIQUE PP to obtain list polling locations across all elections for geocoding */
 use  `wl_btw13_uq', clear	
 
	merge 1:1 wahllokal_id using `wl_kow14_uq', nogen
	merge 1:1 wahllokal_id using `wl_btw17_uq', nogen
	merge 1:1 wahllokal_id using `wl_ltw18_uq', nogen
	merge 1:1 wahllokal_id using `wl_euw19_uq', nogen
	merge 1:1 wahllokal_id using `wl_kow20_uq', nogen
	merge 1:1 wahllokal_id using `wl_btw09_uq', nogen 
	
	// SAVE: list of UNIQUE PP across all elections for geocoding
	save "$tmp/wl_unique.dta", replace


/* 3. CLEAN assignments of pp to precincts & APPEND PP across elections   */	
tempfile wl_btw09_cl wl_btw13_cl wl_ltw13_cl wl_kow14_cl wl_euw14_cl wl_btw17_cl wl_ltw18_cl wl_euw19_cl wl_kow20_cl
	
foreach election in "btw09" "btw13" "kow14" "btw17" "ltw18" "euw19" "kow20" {

 * Pull: correspondence of precinct to polling places
	use `wl_`election'', clear
	drop strasse_wl nummer_wl
		
 * In most years sb_all contains a list of precincts assigned to one PP, e.g., "1901, 1902, 1903"
 *	 => expand this list so that each precinct has its own row
	
	// split precincts in separate variables (help1 help2 ...)
	gen tmp_list = ustrregexra(sb_all,"[\s+]?\,[\s+]?"," ")
	split tmp_list, p(" ") gen(help)
	
	// extract how many distinct precincts per PP 
	destring help*, replace
	egen help_distinct=rownonmiss(help*)
	
	// gen counter for each entry (wahllokal_id does not uniquely id obs)
	gen tmp_id = _n

	// produce #copies of each PP equal to #of precincts
	expand help_distinct 
	
	// gen sb := precinct ID, and populate variable 
	gen 	sb=. 
	lab var sb "Precinct ID"
	qui su help_distinct
	forvalues j = 1/`r(max)' {
		bys tmp_id: replace sb= help`j' if _n==`j'
	}
	drop tmp* help*
	// Assert: no missings & not ambiguous mappings of precincts to PP
	assert !missing(sb)
	isid sb

	drop sb_all

* save
	isid wahllokal_id sb
	save `wl_`election'_cl'

* Assignments of LTW13 = BTW13 and EU14 = KOW14 => save 
	if "`election'" == "btw13" {
	save `wl_ltw13_cl'
	}
	else if "`election'" == "kow14" {
	save `wl_euw14_cl'
	}
}

 ** APPEND
 clear 
 gen wahl =""
 gen wahl_id = .
 lab var wahl "Election ID (string)"
 lab var wahl_id "Election ID (num)"
 local j=0
/* loop over elections in chronological order */
foreach election in "btw09" "ltw13" "btw13" "kow14" "euw14" "btw17" "ltw18" "euw19" "kow20" {	
	append using `wl_`election'_cl'
	replace wahl = upper("`election'") if missing(wahl)
	replace wahl_id= `j' if missing(wahl_id)
	local ++j
}
	labmask wahl_id, values(wahl)

** SAVE: Assignments of PP to precincts across all elections (w/o geocodes)
save "$tmp/wl_sb_assignments_raw", replace


 